# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Figures inline and set visualization style
%matplotlib inline
sns.set()
from google.colab import files
uploaded = files.upload()
import io
df_train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
df_train.head(n=4)
uploaded = files.upload()
df_test = pd.read_csv(io.BytesIO(uploaded['test.csv']))
df_test.head(n=4)
# Keep the target aside, then stack the training predictors on top of the
# test set; the feature engineering further down operates on `data`.
SalePrice_train = df_train['SalePrice']
data = pd.concat([
    df_train.drop(columns=['SalePrice']),
    df_test,
])

# Column overview of the training frame, then summary statistics of the target.
df_train.info()
df_train['SalePrice'].describe()
# Plot styling used by the figures that follow.
sns.set(
    style='whitegrid',
    palette="deep",
    font_scale=1.1,
    rc={"figure.figsize": [8, 5]},
)

# Histogram of the target with a kernel density estimate overlaid.
sale_price = df_train['SalePrice']
sns.histplot(sale_price, kde=True)
plt.title("Histogram for SalePrice")

# Skewness and kurtosis of the target distribution.
print("Skewness: %f" % sale_price.skew())
print("Kurtosis: %f" % sale_price.kurt())
# Figure 1.1: Distribution of the dependent variable, SalePrice
# Box plot of the target, with the layout padding tightened.
df_train['SalePrice'].plot(kind='box')
plt.tight_layout(pad=0.5)
# Figure 1.2: Box plot of the dependent variable, SalePrice
# Probability (Q-Q) plot of the target, drawn through the pyplot interface.
stats.probplot(df_train['SalePrice'], plot=plt)
# Figure 1.3: Q-Q plot of the dependent variable, SalePrice
# Summary statistics of the training frame.
df_train.describe()

# Percentage of missing values per column, listed from most to least missing
# and restricted to the columns that have any.
total_rows = len(df_train)
percent_missing = df_train.isna().sum() * 100 / total_rows
df_train_missing_value = pd.DataFrame(
    {'column_name': df_train.columns, 'percent_missing': percent_missing}
)
df_train_missing_value = df_train_missing_value.sort_values(
    by='percent_missing', ascending=False
)
df_train_missing_value.loc[df_train_missing_value['percent_missing'] > 0]
# Figure 2.1: Columns with missing values, ranked by percentage missing
# Numeric predictors: every numeric column except the row identifier and the
# target. They are selected straight from df_train so this step does not
# depend on df_train_num, which is only created further down the file.
df_train_num_predictors = (
    df_train.select_dtypes(include=np.number)
    .drop(['Id', 'SalePrice'], axis=1)
)
print(df_train_num_predictors.shape)

# One 10-bin histogram per numeric predictor, laid out on a 6x6 grid.
# The trailing semicolon suppresses the notebook's echo of the returned axes.
df_train_num_predictors.hist(bins=10, figsize=(15, 20), layout=(6, 6));
# Scatter plot of each numeric predictor against SalePrice, three panels per
# row. Columns are taken in order up to (but not including) 'MiscVal'.
scatter_features = list(df_train_num_predictors)
if 'MiscVal' in scatter_features:
    scatter_features = scatter_features[:scatter_features.index('MiscVal')]

n_cols = 3
# Ceiling division, with a floor of one row: plt.subplots rejects nrows=0.
n_rows = max(1, -(-len(scatter_features) // n_cols))
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(6 * n_cols, 5 * n_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)

panels = axs.flatten()
for panel, feature in zip(panels, scatter_features):
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice',
                    palette='Blues', data=df_train, ax=panel)
    panel.set_xlabel('{}'.format(feature), size=15, labelpad=12.5)
    panel.set_ylabel('SalePrice', size=15, labelpad=12.5)
    panel.tick_params(axis='x', labelsize=12)
    panel.tick_params(axis='y', labelsize=12)
    panel.legend(loc='best', prop={'size': 10})

# Hide the panels of the last row that received no feature.
for panel in panels[len(scatter_features):]:
    panel.set_visible(False)

fig.tight_layout()
plt.show()
# Figure 3.1: Scatter plots of the numeric variables versus SalePrice
# Candidate outliers: rows with a very large GrLivArea or 1stFlrSF that sold
# for less than 200000, and rows whose LotFrontage exceeds 300.
sold_below_200k = df_train['SalePrice'] < 200000
print(df_train[(df_train['GrLivArea'] > 4500) & sold_below_200k])
print(df_train[(df_train['1stFlrSF'] > 4500) & sold_below_200k])
print(df_train[df_train['LotFrontage'] > 300])
# Figure 2.2: List of outliers
# Split the training columns by dtype: non-numeric columns are treated as
# categorical; numeric columns, minus the Id column, as numerical.
df_train_categorical = df_train.select_dtypes(exclude='number')
print("Categorical:", df_train_categorical.shape)

df_train_num = df_train.select_dtypes(include='number').drop(columns=['Id'])
print("Numerical:", df_train_num.shape)
# Annotated heatmap of the pairwise correlations between the numeric columns,
# drawn on a fresh 38x38 figure with a diverging colour scale centred on 0.
plt.subplots(figsize=(38, 38))
correlations = df_train_num.corr()
sns.heatmap(
    correlations,
    annot=True,
    fmt='.1g',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
# Figure 3.2: Correlation matrix of the numeric variables
# Count plot of each categorical column, one per panel of a 15x3 grid.
# zip stops at the shorter input, so any surplus panels stay empty.
fig, ax = plt.subplots(nrows=15, ncols=3, figsize=(20, 100))
for column, panel in zip(df_train_categorical.columns, ax.ravel()):
    sns.countplot(x=column, data=df_train, ax=panel)
# Figure 3.3: Bar plots of the categorical variables
# Box plot of SalePrice for each categorical column, one per panel of a
# 15x3 grid. zip stops at the shorter input, so surplus panels stay empty.
fig, ax = plt.subplots(nrows=15, ncols=3, figsize=(20, 100))
for column, panel in zip(df_train_categorical.columns, ax.ravel()):
    sns.boxplot(x=column, y='SalePrice', data=df_train, ax=panel)
# Figure 3.4: Box plots of SalePrice by categorical variable
# Closer look at three categorical columns: for each one, a count plot
# followed by a SalePrice box plot, stacked in a single six-row figure.
fig = plt.figure(figsize=(25, 60))
for pair_index, column in enumerate(['Neighborhood', 'Exterior1st', 'Exterior2nd']):
    count_axes = fig.add_subplot(6, 1, 2 * pair_index + 1)
    sns.countplot(x=column, data=df_train, ax=count_axes)
    box_axes = fig.add_subplot(6, 1, 2 * pair_index + 2)
    sns.boxplot(x=column, y='SalePrice', data=df_train, ax=box_axes)

# Columns available in the combined train+test frame.
print(data.columns)
# Figure 4.1: Feature creation
# Engineered features, computed on the combined train+test frame.

# Total floor area: both floors plus the basement.
data['TotalSF'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['TotalBsmtSF']

# Combined area of all porch types and the wood deck.
data['TotalPorchSF'] = (
    data['OpenPorchSF']
    + data['EnclosedPorch']
    + data['3SsnPorch']
    + data['ScreenPorch']
    + data['WoodDeckSF']
)

data['HouseAge'] = data['YrSold'] - data['YearBuilt']
data['QualityIndex'] = data['OverallQual'] * data['OverallCond']

# Half baths are weighted 0.5.
data['Total_Bathrooms'] = (
    data['BsmtFullBath']
    + .5 * data['BsmtHalfBath']
    + data['FullBath']
    + .5 * data['HalfBath']
)

# Presence flags. The test must be strictly positive: a `>= 0` comparison is
# true for every non-missing row (areas are presumably never negative), which
# would make each flag a constant column. Missing values compare False -> 0.
data['Has_Fireplaces'] = np.where(data['Fireplaces'] >= 1, 1, 0)
area_flags = {
    'Has_Bsmt': 'TotalBsmtSF',
    'Has_Garage': 'GarageArea',
    'Has_Pool': 'PoolArea',
    'Has_2ndStory': '2ndFlrSF',
}
for flag_name, area_column in area_flags.items():
    data[flag_name] = np.where(data[area_column] > 0, 1, 0)

data.head()
# Figure 4.2: Imputation of missing data
# Categorical columns whose missing values become the explicit '_NA_' level.
na_level_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond',
    'BsmtFinType1', 'MasVnrType', 'Electrical',
]
data[na_level_columns] = data[na_level_columns].fillna('_NA_')

# A missing MasVnrArea is filled with 0.
data['MasVnrArea'] = data['MasVnrArea'].fillna(0)


def _fill_with_group_median(values):
    # Replace the missing entries of one group by that group's median.
    return values.fillna(values.median())


# These two columns are filled with the median of the row's Neighborhood.
for column in ('GarageYrBlt', 'LotFrontage'):
    data[column] = data.groupby('Neighborhood')[column].transform(_fill_with_group_median)
# Figure 4.3: Encoding of categorical variables
# One-hot encode the columns identified as categorical on the training set;
# drop_first=True is passed so the first level of each column gets no dummy.
categorical_columns = df_train_categorical.columns.tolist()
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
data.head()
# Figure 5.1: Standard and min-max scaling of the (log-transformed) dependent variable
# Replace SalePrice in place by log(1 + SalePrice).
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

# Fit each scaler on the log-transformed target and store the result in its
# own column. `scaler` ends up bound to the fitted MinMaxScaler.
log_price = df_train[["SalePrice"]]
for scaled_column, scaler in (
    ("StandardScal_SalePrice", StandardScaler()),
    ("MaxMinScal_SalePrice", MinMaxScaler()),
):
    df_train[scaled_column] = scaler.fit_transform(log_price)

df_train.head()